# Diagnostic Classifier
## Load data
Show code cell source
import pandas as pd
import sys
sys.path.append('../')
from source.pacmap_functions import *
# Locations of the intermediate inputs and of the processed outputs. Both
# paths keep their trailing slash because file names are appended to them
# with plain string concatenation.
mount = '/mnt/d/'
input_path = f'{mount}MethylScore/Intermediate_Files/'
output_path = f'{mount}MethylScore/Processed_Data/'

# Pickled discovery and test frames, each ordered by its index after loading.
discovery_pickle = f'{input_path}3308samples_333059cpgs_withbatchcorrection_bvalues.pkl'
test_pickle = f'{input_path}201samples_357839cpgs_withbatchcorrection_bvalues.pkl'
df_discovery = pd.read_pickle(discovery_pickle).sort_index()
df_test = pd.read_pickle(test_pickle).sort_index()

# Clinical annotations; the first CSV column becomes the index.
# Note that the *test* clinical data is read from
# 'validation_clinical_data.csv'.
discovery_clinical_data = pd.read_csv(
    f'{input_path}discovery_clinical_data.csv',
    index_col=0,
    low_memory=False,
)
test_clinical_data = pd.read_csv(
    f'{input_path}validation_clinical_data.csv',
    index_col=0,
    low_memory=False,
)
Show code cell source
from sklearn.model_selection import train_test_split

# Stratification needs a label for every sample, so the validation split is
# drawn only from samples that have a 'WHO 2022 Diagnosis' value.
has_who_dx = discovery_clinical_data['WHO 2022 Diagnosis'].notna()
discovery_clinical_data2 = discovery_clinical_data[has_who_dx]
df_discovery2 = df_discovery.loc[discovery_clinical_data2.index]

# Hold out 10% of the labelled samples, stratified by diagnosis, with a
# fixed seed. The complementary part of this split is discarded.
_, df_validation = train_test_split(
    df_discovery2,
    test_size=0.1,
    random_state=42,
    stratify=discovery_clinical_data2['WHO 2022 Diagnosis'],
)

# Training data is every row of `df_discovery` that was not held out, which
# includes the samples without a 'WHO 2022 Diagnosis' value.
in_validation = df_discovery.index.isin(df_validation.index)
df_train = df_discovery[~in_validation]

# Clinical rows matching each split.
train_clinical_data = discovery_clinical_data.loc[df_train.index]
validation_clinical_data = discovery_clinical_data.loc[df_validation.index]

# Tag every clinical table with the split it belongs to.
split_labels = (
    (train_clinical_data, 'Train Samples'),
    (validation_clinical_data, 'Validation Samples'),
    (test_clinical_data, 'Test Samples'),
)
for clinical_frame, split_label in split_labels:
    clinical_frame['Train-Validation-Test'] = split_label

# Batch comes from the 'Batch' column of the data frames for the train and
# validation splits; the test split gets one constant batch label.
batch_values = (
    (train_clinical_data, df_train['Batch']),
    (validation_clinical_data, df_validation['Batch']),
    (test_clinical_data, "St Jude Children's"),
)
for clinical_frame, batch in batch_values:
    clinical_frame['Batch'] = batch
## Select CpGs in both train and test
Show code cell source
# Columns present in both `df_discovery` and `df_test`, kept in
# `df_discovery`'s column order.
test_columns = df_test.columns
common_features = [cpg for cpg in df_discovery.columns if cpg in test_columns]

# Restrict the train, validation and test frames to the shared columns.
# The frames are rebound one at a time on purpose, so the old and new
# versions of all three are never alive together.
df_train = df_train[common_features]
df_validation = df_validation[common_features]
df_test = df_test[common_features]

# NOTE(review): `output_notebook` is not defined in the visible code;
# presumably Bokeh's notebook output setup pulled in by the star import --
# confirm.
output_notebook()
Show code cell source
# Stack the validation rows on top of the test rows so both held-out sets
# can be handled as a single frame.
held_out_frames = [df_validation, df_test]
df_validation_test = pd.concat(held_out_frames)

# Same stacking, in the same order, for the matching clinical tables.
held_out_clinical = [validation_clinical_data, test_clinical_data]
validation_test_clinical_data = pd.concat(held_out_clinical)
## Acute Leukemia Methylome Atlas
Show code cell source
# Clinical trial names handed to DataProcessor in the (commented-out)
# PaCMAP run below.
clinical_trials = [
    "NOPHO ALL92-2000",
    "AAML0531",
    "AAML1031",
    "Beat AML Consortium",
    "TCGA AML",
    "CETLAM SMD-09 (MDS-tAML)",
    "French GRAALL 2003–2005",
    "TARGET ALL",
    "AAML03P1",
    "Japanese AML05",
    "CCG2961",
]

# Sample type labels handed to DataProcessor in the same run.
sample_types = [
    "Diagnosis",
    "Primary Blood Derived Cancer - Bone Marrow",
    "Bone Marrow Normal",
    "Primary Blood Derived Cancer - Peripheral Blood",
    "Blood Derived Normal",
    "Relapse",
    "Recurrent Blood Derived Cancer - Bone Marrow",
    "Recurrent Blood Derived Cancer - Peripheral Blood",
]

# Clinical columns handed to DataProcessor in the same run.
cols = [
    "Clinical Trial",
    "Sample Type",
    "Patient_ID",
    "WHO 2022 Diagnosis",
    "ELN 2022 Diagnosis",
    "Hematopoietic Entity",
    "Train-Validation-Test",
    "Batch",
]

# The PaCMAP fitting run is left commented out. It fits one model per entry
# in `components` on the training data, applies it to the combined
# validation + test data, and writes the joined embedding to
# `pacmap_output/pacmap_{n}d_model_dx_al.csv` under `output_path`.
#
# components = [2, 5]
# for n in components:
#     processor = DataProcessor(train_clinical_data.copy(),
#                               df_train,
#                               clinical_trials,
#                               sample_types,
#                               cols,
#                               n_components=n,
#                               common_prefix=output_path+f'pacmap_output/pacmap_{n}d_model_dx_al',
#                               df_test=df_validation_test.copy(),
#                               test_clinical_data=validation_test_clinical_data.copy())
#     processor.filter_data()
#     processor.apply_pacmap()       # learn PaCMAP on the training data
#     processor.apply_pacmap_test()  # apply PaCMAP to the test data
#     processor.join_labels()        # join clinical data to the embedding
#     # Save output
#     processor.df.to_csv(output_path+f'pacmap_output/pacmap_{n}d_model_dx_al.csv')
The PaCMAP instance is successfully saved at /mnt/d/MethylScore/Processed_Data/pacmap_output/pacmap_2d_model_dx_al.pkl.
To load the instance again, please do `pacmap.load(/mnt/d/MethylScore/Processed_Data/pacmap_output/pacmap_2d_model_dx_al)`.
The PaCMAP instance is successfully saved at /mnt/d/MethylScore/Processed_Data/pacmap_output/pacmap_5d_model_dx_al.pkl.
To load the instance again, please do `pacmap.load(/mnt/d/MethylScore/Processed_Data/pacmap_output/pacmap_5d_model_dx_al)`.
Show code cell source
# Read the saved 2-component embedding; the first CSV column is the index.
embedding_csv = output_path + 'pacmap_output/pacmap_2d_model_dx_al.csv'
df = pd.read_csv(embedding_csv, index_col=0)

# Stack the train, validation and test clinical tables, then pick the rows
# named by the embedding's 'index' column, in that order.
all_clinical_data = pd.concat(
    [train_clinical_data, validation_clinical_data, test_clinical_data]
)
clinical_data = all_clinical_data.loc[df['index']]

# Clinical columns to attach to the embedding and hand to the plotter.
cols = [
    'WHO 2022 Diagnosis',
    'ELN 2022 Diagnosis',
    'Hematopoietic Entity',
    'FAB',
    'FLT3 ITD',
    'Age (group years)',
    'Complex Karyotype',
    'Batch',
    'Sex',
    'MRD 1 Status',
    'Leucocyte counts (10⁹/L)',
    'Risk Group',
    'Race or ethnic group',
    'Trisomy 8 Status',
    'Clinical Trial',
    'Vital Status',
    'First Event',
    'Sample Type',
    'Train-Validation-Test',
]

# Attach the clinical columns by matching the embedding's 'index' column
# against the clinical table's index; clashing names get a '_copy' suffix.
df = df.join(clinical_data[cols], on='index', rsuffix='_copy')

# NOTE(review): BokehPlotter and get_custom_color_palette are not defined in
# the visible code; presumably they come from the star import -- confirm.
plotter = BokehPlotter(
    df,
    cols,
    get_custom_color_palette(),
    title='',
    x_range=(-45, 45),
    y_range=(-45, 45),
    datapoint_size=4,
    tooltip_dx_cols='WHO 2022 Diagnosis',
    width=1000,
    height=600,
)
plotter.plot()